Clustering

ratings <- readRDS(file = "data/movieratings.Rds")
str(ratings)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3000 obs. of 6
variables:
## $ userId : int 15 15 15 15 15 15 15 15 15 15 ...
## $ movieId : int 1 32 47 50 110 150 260 296 318 356 ...
## $ rating : num 2 4 5 5 3 3 5 5 2 1 ...
## $ timestamp: int 997938310 997938466 1054449816
997938500 1040205792 997939380 997938437 997938771
997938727 1058250631 ...
## $ title : chr "Toy Story" "Twelve Monkeys" "Seven"
"Usual Suspects, The" ...
## $ genres : Factor w/ 902 levels "(no genres listed)",..:
329 887 889 743 271 453 118 609 700 649 ...

Visualising the raw data

Heatmaps

Manual heatmaps with image

# heatmap
heat <- heatmap(as.matrix((ratings_wide[, -1])), scale = "none", 
    margins = c(12, 5))
row_ordering <- heat$rowInd
col_ordering <- heat$colInd
# reshape long to wide
ratings_wide <- ratings %>% dplyr::select(userId, title, rating) %>% 
    pivot_wider(names_from = title, values_from = rating)
# image wants a matrix of ratings
ratings_mat <- as.matrix(ratings_wide[, -1])

image(t(ratings_mat[row_ordering, col_ordering]))

How does it cluster?

Hierarchical clustering with hclust

# single linkage
hcl_single <- dist(ratings_wide[, -1]) %>% hclust(method = "single")
plot(hcl_single)

# complete linkage
hcl_complete <- dist(ratings_wide[, -1]) %>% hclust(method = "complete")
plot(hcl_complete)

# centroid linkage
hcl_centroid <- dist(ratings_wide[, -1]) %>% hclust(method = "centroid")
plot(hcl_centroid)

Where to cut?

Allocate each row to a cluster

user_clusters <- cutree(hcl_complete, h = 9)
ratings_wide <- ratings_wide %>% mutate(user_clusters = user_clusters)
table(ratings_wide$user_clusters)
## 
##  1  2  3  4  5  6 
##  7 30 32 28  2  1

Review clusters

Redo heatmap after grouping users into clusters

… or keep users separate but reorder

Clustering movies

Transpose and repeat…

# transpose and leave out Id and cluster memb columns
ratings_wide_t <- t(ratings_wide[, -c(1, 32)]) %>% as.data.frame()
# column names should be userId's
names(ratings_wide_t) <- ratings_wide$userId
# complete linkage again (or try others)
hcl_complete <- dist(ratings_wide_t) %>% hclust(method = "complete")

Allocate movies to clusters

title_clusters <- cutree(hcl_complete, k = 6)
table(title_clusters)
## title_clusters
## 1 2 3 4 5 6 
## 9 6 7 5 2 1

# movies in cluster 1
str_c(title_clusters_df[title_clusters_df$title_clus == 1, "title"], 
    collapse = "; ")
## [1] "Toy Story; Braveheart; Apollo 13; Lion King, The;
True Lies; Fugitive, The; Jurassic Park; Aladdin; Dances
with Wolves"
# movies in cluster 2
str_c(title_clusters_df[title_clusters_df$title_clus == 2, "title"], 
    collapse = "; ")
## [1] "Twelve Monkeys; Seven; Terminator 2: Judgment Day;
Matrix, The; Fight Club; Lord of the Rings: , The"

Review clusters

Heatmap with movie clusters

… or just reordering movies

Putting it all together

World’s smallest heatmap

k-means clustering

df <- ratings_wide %>% dplyr::select_at(vars(-starts_with("user")))
kmeansObj <- kmeans(df, centers = 6)
names(kmeansObj)
## [1] "cluster" "centers" "totss" "withinss"
"tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"

kmeansObj$cluster
## [1] 6 6 4 6 1 2 3 1 5 6 5 5 3 3 6 1 2 5 4 6 5 2 5 3 5 6
6 5 6 1 5 1 5 2 3 1 1
## [38] 6 2 3 4 2 3 2 5 3 6 4 6 5 5 3 1 5 5 6 3 3 1 1 3 4 3
4 6 1 3 4 6 2 6 5 1 4
## [75] 4 3 2 4 6 3 5 4 6 5 1 6 4 3 6 6 5 6 6 6 4 3 5 1 6 2
kmeansObj$centers[, 1:3]
##   Toy Story Twelve Monkeys    Seven
## 1  3.964286       4.035714 4.392857
## 2  3.950000       3.200000 3.400000
## 3  3.277778       3.277778 3.583333
## 4  4.192308       3.346154 3.730769
## 5  3.000000       3.050000 3.175000
## 6  3.480000       3.940000 4.180000

Plotting cluster means

# tidy for ggplot
kmm <- kmeansObj$centers %>% as.data.frame() %>% mutate(user_clus_km = paste0("clust", 
    1:6)) %>% pivot_longer(-user_clus_km, names_to = "title", 
    values_to = "rating")
head(kmm)
## # A tibble: 6 x 3
##   user_clus_km title               rating
##   <chr>        <chr>                <dbl>
## 1 clust1       Toy Story             3.96
## 2 clust1       Twelve Monkeys        4.04
## 3 clust1       Seven                 4.39
## 4 clust1       Usual Suspects, The   4.68
## 5 clust1       Braveheart            4.75
## 6 clust1       Apollo 13             4

similarity between hclust and kmeans

table(user_clusters, kmeansObj$cluster)
##              
## user_clusters  1  2  3  4  5  6
##             1  0  0  2  1  0  4
##             2 13  0  0  7  0 10
##             3  1 10  0  1  9 11
##             4  0  0 15  4  9  0
##             5  0  0  0  0  2  0
##             6  0  0  1  0  0  0

Dimension reduction

Dimension reduction = clustering for variables

# don't need to reorder rows and cols, but will make
# visualizing PCA easier
ratings_scaled_ordered <- scale(ratings_mat)[hcl_row_ordering, 
    hcl_col_ordering]

# do the svd
svd1 <- svd(ratings_scaled_ordered)

# rename svd outputs
u <- svd1$u
v <- svd1$v
d <- diag(svd1$d)

# approximate original data with outer product of first
# singular vector
approx1 <- u[, 1] %*% matrix(d[1, 1], nrow = 1) %*% t(v[, 1])

approx2 <- u[, 1:2] %*% d[1:2, 1:2] %*% t(v[, 1:2])
approx5 <- u[, 1:5] %*% d[1:5, 1:5] %*% t(v[, 1:5])
approx30 <- u %*% d %*% t(v)

How many vectors are needed?

SVD vs PCA

PCs are equal to the right SVs if you scale the data to have mean 0 and sd 1

ratings_ordered <- ratings_mat[hcl_row_ordering, hcl_col_ordering]
pca1 <- prcomp(ratings_ordered, scale = TRUE)
plot(pca1$rotation[, 1], svd1$v[, 1])

Image compression with PCA

# transform into matrix form
mona_mat <- as.data.frame(mona) %>% pivot_wider(names_from = y, 
    values_from = value) %>% dplyr::select(-x) %>% as.matrix()
image(mona_mat[, nrow(mona_mat):1], col = gray.colors(100))

# svd
svd1 <- svd(mona_mat)
u <- svd1$u
v <- svd1$v
d <- diag(svd1$d)

# number of singular values
nsv <- 1
# approximate original data with outer product of first N
# singular vectors
approx <- u[, 1:nsv] %*% matrix(d[1:nsv, 1:nsv], nrow = nsv) %*% 
    t(v[, 1:nsv])

With 1 SV

With 2 SV

With 5 SV

With 20 SV

With 50 SV

How many SVs is enough?